Author: Szymon Pawłowski¶

Our task is to analyze a set of medical data and create a model to classify a potential patient due to the fact of stroke.

A stroke is a set of clinical symptoms associated with the sudden onset of focal or generalized brain dysfunction, resulting from a disruption of cerebral circulation and persisting for more than 24 hours.

TOC ¶

  • Libraries and functions
  • Data
  • EDA
    • stroke
    • id
    • gender
    • age
    • hypertension
    • heart_disease
    • ever_married
    • work_type
    • residence_type
    • avg_glucose_level
    • bmi
    • smoking_status
  • Feature engineering
    • bmi
    • gender
    • age
    • ever_married
    • work_type
    • residence_type
    • avg_glucose_level
    • smoking_status
  • Correlation analysis
  • Polynomial Features
  • Balancing data set
    • SMOTE
    • SMOTE+Tomek
    • SMOTE+ENN
  • Metrics
  • Modelling
    • Selecting the balancing technique
    • Feature selection using LASSO
    • Hyperparameters tuning
    • Threshold $p$
  • Example pipeline

Libraries and functions ¶

TOC

In [1]:
import pandas as pd
import numpy as np
import joblib

from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC
from imblearn.over_sampling import ADASYN
from imblearn.combine import SMOTETomek
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import SelectFromModel

from yellowbrick import classifier
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier.rocauc import roc_auc

import warnings
warnings.filterwarnings("ignore")
In [2]:
def plotting(data: pd.DataFrame, x: str, type: str, y=None, cl = 'stroke'):
  """
    Function for plotting data in specific way

  Args:
    data - dataset to plot

    x - column for x-axis
    
    y - column for y-axis
    
    type - type of plot 
      values: {'hist', 'box', 'hist_box', 'class_hist', 'class_box', 'class_hist_box', 'scatter', 'heatmap', 'lineplot'} 

    cl - class

  Returns:
    Plot in plotly
  """

  def plot_class_hist():
    """
      Plots histogram with stroke classes in plotly
    """
    
    fig = go.Figure()
    fig.add_trace(go.Histogram(x = data.loc[data[cl]==0, x], name='No '+cl))
    fig.add_trace(go.Histogram(x = data.loc[data[cl]==1, x], name=cl))

    fig.update_layout(barmode='overlay')
    fig.update_traces(opacity=0.75)
    
    return fig
  
  def plot_class_box():
    """
      Plots boxplot with stroke classes in plotly
    """
    
    fig = go.Figure()
    fig.add_trace(go.Box(y = data.loc[data[cl]==0, x], name='No '+cl))
    fig.add_trace(go.Box(y = data.loc[data[cl]==1, x], name=cl))

    fig.update_layout(barmode='overlay')
    fig.update_traces(opacity=0.75)
    
    return fig
    

  if type=='hist':
    fig = px.histogram(data, x=x)
    fig.show()

  elif type=='box':
    fig = px.box(data, x=x)
    fig.show()
  
  elif type=='scatter':
    data['temp'] = data[cl].astype('str')

    fig = px.scatter(data, x=x, y=y, color='temp')
    fig.update_layout(title_text="Scatterplot for " + x)
    fig.update_traces(marker_size=10)
    
    fig.show()
  
  elif type=='heatmap':
    corr = data.corr()
    f, ax = plt.subplots(figsize=(16, 12))
    mask = np.triu(np.ones_like(corr, dtype=bool))
    cmap = sns.diverging_palette(230, 20, as_cmap=True)
    sns.heatmap(corr, annot=True, mask = mask, cmap=cmap)

  elif type=='hist_box':
    fig = make_subplots(rows=1, cols=2)

    fig.add_trace(go.Histogram(x = data[x], name=x), row=1,col=1)
    fig.add_trace(go.Box(y = data[x], name=x), row=1, col=2)
    
    fig.update_layout(barmode="overlay")
    fig.update_traces(opacity=0.7, row=1, col=1)
    fig.update_layout(title_text="Histogram and Boxplot for " + x)
    fig.show()

  elif type=='class_hist':
    
    fig = plot_class_hist()
    fig.show()
  
  elif type=='class_box':
    
    fig = plot_class_box()
    fig.show()

  elif type=='class_hist_box':
    fig = make_subplots(rows=1, cols=2)

    fig.add_trace(go.Histogram(x = data.loc[data[cl]==0, x], name='No '+cl), row=1,col=1)
    fig.add_trace(go.Histogram(x = data.loc[data[cl]==1, x], name=cl), row=1, col=1)
    fig.add_trace(go.Box(y = data.loc[data[cl]==0, x], name='No '+cl), row=1, col=2)
    fig.add_trace(go.Box(y = data.loc[data[cl]==1, x], name=cl), row=1, col=2)

    fig.update_layout(barmode="overlay")
    fig.update_traces(opacity=0.7, row=1, col=1)
    fig.update_layout(title_text="Histogram and Boxplot with classes for " + x)
    fig.show()

  elif type=='lineplot':
    fig = px.line(data, x=x, y=y, color=cl)
    fig.show()

  else:
    raise NotImplementedError('Not implemented!')

  return 0
In [3]:
def iqr_outliers(df: pd.DataFrame, col: str):
  """
  Function for managin outliers in given column from dataframe using IQR method. 
  In descriptive statistics, the interquartile range (IQR) is a measure of statistical dispersion, which is the spread of the data. 
  It is defined as the difference between the 75th and 25th percentiles of the data.

  Args:
    df - dataset to remove outliers

    col - column for removing outliers
    
  Returns:
    Whole dataframe with specified column cleaned from outliers using IQR method

  """
  Q1 = df[col].quantile(0.25)
  Q3 = df[col].quantile(0.75)
  IQR = Q3 - Q1
  df.loc[df[col] <= Q1 - 1.5*IQR, col] = df[col].quantile(0.1)
  df.loc[df[col] >= Q3 + 1.5*IQR, col] = df[col].quantile(0.9)

  return df
In [4]:
def balance_data(X, y, algorithm, cat_features, strategy=1, seed = 123, k_neighbors = 5, n_neighbors=3):
  """
  Function for balancing dataset using specified algorithm and sampling strategy. 

  Args:
    X - matrix of features

    y - vector of target variable observations

    algorithm - algorithm for balancing dataset
      values: {'SMOTE', 'SMOTETOMEK', 'SMOTEENN'}

    strategy - sampling information to resample the data set

    Optional **kwargs:
      k_neighbors - number of neighbours for SMOTE algorithm
      n_neighbors - number of neighbours for ADASYN and ENN algorithms

  Returns:
    Whole balanced dataframe 
  """

  ft = []
  for feat in cat_features:
    ft.append(X.columns.get_loc(feat))

  #sm = SMOTE(sampling_strategy = strategy, k_neighbors = k_neighbors)
  sm = SMOTENC(random_state=seed, categorical_features=ft, sampling_strategy = strategy, k_neighbors=k_neighbors)

  if algorithm == 'SMOTE':
    bs = sm

  elif algorithm == 'SMOTETOMEK':
    
    bs = SMOTETomek(random_state = seed, sampling_strategy = strategy, smote= sm)

  elif algorithm == 'SMOTEENN':
    enn = EditedNearestNeighbours(n_neighbors = n_neighbors)

    bs = SMOTEENN(sampling_strategy = strategy, smote=sm, enn = enn)
  
  else:
    raise NotImplementedError('Not implemented!')

  X_bs,  y_bs = bs.fit_resample(X, y)

  df_bs = X_bs.copy()
  df_bs['stroke'] = y_bs

  X_bs = X_bs.drop(columns='index', axis=1)

  print("Liczba wszystkich obserwacji: ", len(y_bs))
  print("\nLiczba obserwacji z klasy mniejszościowej: ", len(y_bs[y_bs==1]))
  print("\nProcent klasy mniejszościowej do całości po zbalansowaniu: ", len(y_bs[y_bs == 1])/len(y_bs))
  
  return df_bs, X_bs, y_bs
In [5]:
#Create class with threshold p selection

class LogisticRegressionWithThreshold(LogisticRegression):
    def predict(self, X, threshold=None):
        if threshold == None: # If no threshold passed in, simply call the base class predict, effectively threshold=0.5
            return LogisticRegression.predict(self, X)
        else:
            y_scores = LogisticRegression.predict_proba(self, X)[:, 1]
            y_pred_with_threshold = (y_scores >= threshold).astype(int)

            return y_pred_with_threshold

class SupportVectorMachineWithThreshold(SVC):
    def predict(self, X, threshold=None):
        if threshold == None: # If no threshold passed in, simply call the base class predict, effectively threshold=0.5
            return SVC.predict(self, X)
        else:
            y_scores = SVC.predict_proba(self, X)[:, 1]
            y_pred_with_threshold = (y_scores >= threshold).astype(int)

            return y_pred_with_threshold

class RandomForestClassifierWithThreshold(RandomForestClassifier):
    def predict(self, X, threshold=None):
        if threshold == None: # If no threshold passed in, simply call the base class predict, effectively threshold=0.5
            return RandomForestClassifier.predict(self, X)
        else:
            y_scores = RandomForestClassifier.predict_proba(self, X)[:, 1]
            y_pred_with_threshold = (y_scores >= threshold).astype(int)

            return y_pred_with_threshold
In [6]:
def preprocess_data(X_train, X_test, columns_to_standardize, cat_features, scaler=True, ohe=True, test_only=False, sc=None):
  """
  Function preprocesses data by standardizing numerical features and one hot encoding categorical features.

  Args:
    X_train - train dataset of features

    X_test - test dataset of features

    columns_to_standardize - columns to standardize

    cat_features - column for OHE

    scaler - boolean value specifies if standardizing should be performed

    ohe - boolean value specifies if OHE should be performed

    test_only - flag if only test set should be preprocessed

    sc - scaler

  Returns
    X 
      Preprocessed train data

    X_t
      Preprocessed test data
  """
  def ohe(df: pd.DataFrame, cat_features):
    df_ohe = df[cat_features].astype('str')
    dummies = pd.get_dummies(df_ohe)

    df = df.drop(columns=cat_features, axis=1)
    df = pd.merge(df, dummies, left_index=True, right_index=True)
    
    return df
  
  if test_only==True:
    X_t = X_test.copy()

    if scaler == True:
      X_t[columns_to_standardize] = sc.transform(X_t[columns_to_standardize])
  
    if ohe == True:
      X_t = ohe(X_t, cat_features)

    return X_t

  else:

    X = X_train.copy()
    X_t = X_test.copy()

    sc = preprocessing.StandardScaler()
    
    if scaler == True:
      X[columns_to_standardize] = sc.fit_transform(X_train[columns_to_standardize])
      X_t[columns_to_standardize] = sc.transform(X_t[columns_to_standardize])
    
    if ohe == True:
      X = ohe(X, cat_features)
      X_t = ohe(X_t, cat_features)

    return X, X_t, sc
In [7]:
def fit_pred_score(X_train, y_train, X_test, y_test, model, model_name, dataset_name, scaler=True, ohe = False, columns_to_standardize = None, cat_features = None, visualize_train = True, visualize_test = True, show_coefficients = False, threshold = None):
  """
  Function for training and testing given classifier. 
  It plots classification reports for train and test data and might display coefficients of predictors. 
  Returns report in dataframe format for given model and dataset with metrics from classification report.

  Args:

    X_train, y_train, X_test, y_test - training and testing data with associated labels

    model - model to train and test

    model_name - name of the model for the report

    dataset_name - dataset name used for the report

    scaler - if features should be scaled

    columns_to_standardize - columns which should be scaled if scaler = True

    visualize_train - if classification report for training data should be visualized

    visualize_test - if classification report for testing data should be visualized

    show_coefficients - if coefficients of features should be displayed

    threshold - value of 'p' for Logistic Regression model

  Returns:

    report - dataframe with metrics from classification report with associated dataset and model name
  """

  X, X_t, standard_scaler = preprocess_data(X_train, X_test, columns_to_standardize = columns_to_standardize, cat_features = cat_features, scaler = scaler, ohe = ohe)

  model.fit(X, y_train)

  if threshold == None:
    y_pred_train = model.predict(X)
    y_pred = model.predict(X_t)
  else:
    y_pred_train = model.predict(X, threshold = threshold)
    y_pred = model.predict(X_t, threshold = threshold)

  classes=['No stroke', 'Stroke']

  #plot classification report for training data
  if visualize_train == True:
    visualizer = ClassificationReport(
          model, classes=classes)
    visualizer.fit(X, y_train)
    visualizer.score(X, y_train)
    visualizer.show()

  if visualize_test == True:
    #plot classification report for test data
    classifier.classification_report(model, X, y_train, X_t, y_test, classes=classes)

  if show_coefficients == True:
    coef_dict = {}
    for coef, feat in zip(model.coef_[0,:],X.columns):
      print(feat, " = ", coef)

  report = pd.DataFrame()
  for t, y, y_approx in zip(['train','test'], [y_train, y_test], [y_pred_train, y_pred]):
    results = pd.DataFrame(metrics.classification_report(y, y_approx, target_names = classes, output_dict=True))
    results['metric'] = results.index
    results.reset_index(inplace=True, drop=True)
    results['dataset_name'] = dataset_name
    results['model_name'] = model_name
    results['split'] = t
    report = pd.concat([report, results])

  
  
  return report
In [8]:
def transform_report_to_plot(dt : pd.DataFrame, x : str, cl = 'metric', train_test = False):
  """
  Function transforms report dataset for plotting in a way that for F1-score macro avg value is taken, for precision the values for no stroke and for recall the values for stroke. 
  
  Args:
    dt - filtered dataset

    x - axis

    cl - class to display

    train_test - flag which specifies if data should be transformed to differences between train and test

  Returns:
    df - transformed report dataframe
  """
  if train_test == True:
    prefix = 'train_test_diff'
  else:
    prefix = ''
  #cols_to_select
  f1_s = dt.loc[(dt.metric == 'f1-score'),[prefix+'macro avg', cl, x]]
  precision_s = dt.loc[(report.metric == 'precision'),[prefix+'Stroke', cl, x]]
  precision_s[cl] = "Precision (Stroke)"
  recall_s = dt.loc[(dt.metric == 'recall'),[prefix+'Stroke', cl, x]]
  recall_s[cl] = "Recall (Stroke)"
  precision_ns = dt.loc[(report.metric == 'precision'),[prefix+'No stroke', cl, x]]
  precision_ns[cl] = "Precision (No stroke)"
  recall_ns = dt.loc[(dt.metric == 'recall'),[prefix+'No stroke', cl, x]]
  recall_ns[cl] = "Recall (No stroke)"

  columns = ['value', cl, '']
  for d in [f1_s, precision_s, recall_s, precision_ns, recall_ns]:
    d.columns = columns

  df = pd.concat([f1_s, precision_s, recall_s, precision_ns, recall_ns])
  return df
In [9]:
def prepare_diff_dataset(df : pd.DataFrame(), by: str, cols_for_diff = ['No stroke',	'Stroke',	'accuracy',	'macro avg',	'weighted avg']):
  """
  Takes the report dataset and returns dataset with calculated differences in metric between test and train by specified grouping.

  Args:

    df - dataset 

    by - specified grouping

    cols_for_diff - cols to calculate differences

  Returns:
    df_diff
      Dataset with calculated differences
  """
  df_diff = df.copy()

  for col in cols_for_diff:
    df_diff['train_test_diff'+col] = None
    for ds in df_diff[by].unique():
      for m in df_diff.metric.unique():
        filter = (df_diff[by] == ds) & (df_diff.metric == m)
        df_diff.loc[filter, 'train_test_diff'+col] = np.abs(df_diff.loc[filter, col].diff())

  return df_diff

Data ¶

TOC

In [10]:
url = 'https://github.com/maju116/Statistics_II_GUT/raw/main/PROJECT/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(url)

df.head(5)
Out[10]:
id gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
0 9046 Male 67.0 0 1 Yes Private Urban 228.69 36.6 formerly smoked 1
1 51676 Female 61.0 0 0 Yes Self-employed Rural 202.21 NaN never smoked 1
2 31112 Male 80.0 0 1 Yes Private Rural 105.92 32.5 never smoked 1
3 60182 Female 49.0 0 0 Yes Private Urban 171.23 34.4 smokes 1
4 1665 Female 79.0 1 0 Yes Self-employed Rural 174.12 24.0 never smoked 1
In [11]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB
In [12]:
df.describe().transpose()
Out[12]:
count mean std min 25% 50% 75% max
id 5110.0 36517.829354 21161.721625 67.00 17741.250 36932.000 54682.00 72940.00
age 5110.0 43.226614 22.612647 0.08 25.000 45.000 61.00 82.00
hypertension 5110.0 0.097456 0.296607 0.00 0.000 0.000 0.00 1.00
heart_disease 5110.0 0.054012 0.226063 0.00 0.000 0.000 0.00 1.00
avg_glucose_level 5110.0 106.147677 45.283560 55.12 77.245 91.885 114.09 271.74
bmi 4909.0 28.893237 7.854067 10.30 23.500 28.100 33.10 97.60
stroke 5110.0 0.048728 0.215320 0.00 0.000 0.000 0.00 1.00

We have a total of 12 columns of 5110 observations of which one is ('stroke') the target variable and we will try to model it. It is a binary variable indicating whether a particular patient had a stroke or not so it is a typical classification task. In addition to this, there are the other 11 variables:

  • variable id integer type assigning a specific patient is unlikely to add any informational value, but it will allow us to see if a patient is doubled in history which could indicate an increased risk of stroke;
  • variable gender type object is a binary variable indicating the gender of the patient, it will certainly need to be coded. It is expected that it will give important information for the target variable;
  • variable age of type float is a numeric variable indicating the age of the patient and can also be expected to be important in modeling. Here we have a range of values from 25 to 82 without rather strong tails;
  • variable hypertension of integer type is a binary variable indicating the patient's hypertension;
  • variable heart_disease of integer type is a binary variable indicating the incidence of heart disease;
  • variable ever_married object type is a categorical variable indicating whether the patient has ever been married;
  • variable work_type object type is a categorical variable informing about the type of employment;
  • variable residence_type object type is a categorical variable that informs about the structure of the residence;
  • variable avg_glucose_level of the float type is a numeric variable informing about the average level of glucose in the blood;
  • variable bmi of float type is a numeric variable informing about the patient's BMI. It has 201 missing data;
  • variable smoking_status object type is a categorical variable indicating whether and how often the patient smoked;
  • variable stroke of type int is a binary variable indicating whether the patient suffered a stroke and this is our target variable;

We can see that we have 211 missing data for the variable bmi, which we will deal with later in the project.

EDA ¶

TOC

stroke ¶

TOC

In [13]:
df.stroke.value_counts()
Out[13]:
0    4861
1     249
Name: stroke, dtype: int64
In [14]:
plotting(df, 'stroke', type = 'hist')
Out[14]:
0

We can see that our target variable is unbalanced where the majority class is non-stroke patients. We need to keep this in mind, and if left as is, we can't use the accuracy metric, and should use another one - for example, the harmonic mean between precision and recall, or f1-score. However, an unbalanced dataset is quite problematic, as the classifier tends to focus on the prediction of the majority class.

There are ways to balance a dataset that are part of pre-processing that is, pre-processing the data before modeling. Some of the more popular methods include:

  • undersampling - leaves all observations from the minority class and randomly eliminates objects from the majority class.
  • oversampling - leaves all observations from the majority class and randomly replicates elements from the minority class

With undersampling, we can get rid of informationally relevant observations. Additionally, our dataset is small, so we wouldn't want to be limited to just 249 observations. Oversampling, on the other hand, can lead to over-fitting the model to our data. There is also a mixed approach using both undersampling and oversampling, but it is difficult to determine the optimal ratio between the approaches. We will try to empirically test the best method and restrict ourselves to it, of course, based not on simple sampling, but on available and known algorithms for balancing datasets.

id ¶

TOC

In [15]:
print("Number of duplicate patient ID values: ", len(df[df.id.duplicated()]))
Number of duplicate patient ID values:  0

In our dataset, each patient is entered uniquely, so the ID variable is unlikely to be useful to us for anything else, and we will be able to leave it for later stages.

gender ¶

TOC

In [16]:
plotting(data = df, x='gender',  type='class_hist')
Out[16]:
0
In [17]:
df.gender.value_counts()
Out[17]:
Female    2994
Male      2115
Other        1
Name: gender, dtype: int64

There are 3 values in the gender variable (male, female, other). The value 'Other' occurs only once, so it can be treated as missing data. Depending on the values of the other variable for this observation, we will either replace it with male or female (as the one that occurs most often) or remove it.

age ¶

TOC

In [18]:
plotting(data = df, x='age',  type='hist_box')
Out[18]:
0
In [19]:
plotting(data = df, x='age',  type='class_hist_box')
Out[19]:
0

We can observe that the stroke patients in our collection are mainly elderly (60-80 years old). In contrast, people without stroke are fairly evenly distributed, with a median age of 43 years, where it is 71 years for people with stroke. In addition, there are outlier variables in the group of people with stroke. The distribution of the variable itself is close to a uniform distribution with a "coarse" estimate.

hypertension ¶

TOC

In [20]:
plotting(data = df, x='hypertension',  type='class_hist')
Out[20]:
0

It can be seen that the vast majority of patients do not have hypertension. Moreover, despite the lack of a balanced dataset, more stroke patients did not have hypertension at the same time.

heart_disease ¶

TOC

In [21]:
plotting(data = df, x='heart_disease',  type='class_hist')
Out[21]:
0

Similar situation as for the variable hypertension. The vast majority of patients do not have heart disease and, despite the imbalance, most people with stroke also do not have heart disease.

ever_married ¶

TOC

In [22]:
plotting(data = df, x='ever_married',  type='class_hist')
Out[22]:
0

The ever_married variable is a categorical variable that will need to be coded at a value of (0,1). A larger proportion of patients were ever married and it is in this group that stroke occurs in greater numbers.

work_type ¶

TOC

In [23]:
plotting(data = df, x='work_type',  type='class_hist')
Out[23]:
0

The work_type variable, which determines the type of work performed, is also a categorical variable, which we will code into numeric values. The largest number of patients is in the 'private' category, followed by an equally quantitative distribution between 'children', 'self_employed', 'govt_job'. A negligible number have never worked. The largest number of strokes is among the private and self-employed sectors. A small portion in the public sector. Single values are also found in the children's group, but from previous findings, these are most likely outliers. One could also try to categorize this variable and separate the burdened and unburdened sectors in terms of stroke risk.

residence_type ¶

TOC

In [24]:
plotting(data = df, x='Residence_type',  type='class_hist')
Out[24]:
0

The variable specifies the area of residence with a rural and urban split. It can be seen that the split is even and there is a similar proportion of post-stroke to non-stroke people in each division, so this variable probably carries little information for a potential model.

avg_glucose_level ¶

TOC

In [25]:
plotting(data = df, x='avg_glucose_level',  type='hist_box')
Out[25]:
0
In [26]:
plotting(data = df, x='avg_glucose_level',  type='class_hist_box')
Out[26]:
0

We can observe that there are the largest number of patients with average glucose levels in the range of 77.24 - 1st quantile, 114.09 - 2nd quantile, that is, with normal or pre-diabetic levels. This conclusion, however, is generalizable, as the range varies somewhat depending on the time or method of testing, as well as the age of the person being tested. Those with higher mean blood glucose levels are outliers for the non-stroke group, while for the rest of the subjects they fall into the IV quantile. So removing these observations, or changing their values, would result in the loss of perhaps significant information. Instead, we will try to take advantage of this relationship by creating a new categorical variable reporting the average glucose range (normal, pre-diabetic, diabetic) and imputing new values, as well as transforming the entire variable.

bmi ¶

TOC

In [27]:
plotting(data = df, x='bmi',  type='hist_box')
Out[27]:
0
In [28]:
plotting(data = df, x='bmi',  type='class_hist_box')
Out[28]:
0

The distribution of the variable bmi resembles a normal distribution with a heavier right tail. There are also outliers for bmi > 47.5. Most patients are between 23.5 and 33.1, meaning they are normal weight or overweight to the first degree. While the variable itself may not carry much information due to the fact that the distribution of values is similar for people with and without stroke, it is nevertheless possible that there are some significant interactions between the other variables such as mean blood glucose levels. In the context of outlier data, it is also possible to create a categorical variable according to the BMI categories.

In addition, the BMI variable contains data gaps that need to be addressed. There are a number of methods to do this, among others.

  • removing observations
  • filling them with descriptive statistics such as mean, median
  • using ML algorithms such as KNN

Since data gaps are few (only 4%), filling them with descriptive statistics as a simple but effective measure. As mentioned, the distribution of the variable is close to normal so it does not matter much to choose between the median and the mean, but given the slight right tail the median will be chosen. For outlier observations we will use the IQR method.

smoking_status ¶

TOC

In [29]:
plotting(data = df, x='smoking_status',  type='class_hist_box')
Out[29]:
0

It's hard to draw broader conclusions here given the fairly large 'Unknown' group, The largest group of patients had never smoked cigarettes assuming that in the 'Unknown' group the values would be fairly evenly distributed. You can see the distinction for stroke patients that they are mostly non-smokers or occasional smokers. Let's check the age of the people for the Unknown group, perhaps we will naturally separate the underage group.

In [30]:
plotting(data=df[(df.age <= 16)], x='smoking_status', type='class_hist')
Out[30]:
0

We can see that for minors the value of smoking_status is mostly equal to 'Unknown'. Something needs to be done with this group of people, however, you can't just remove these observations, as stroke can also affect children. On the one hand, it is possible to classify these people into the 'Never smoked' group, but it is highly likely that in reality some people may 'smoke'. So we will create a new 'Underage' category for these patients.

Feature engineering ¶

TOC

In feature engineering, we will create new variables from existing ones, deal with missing data or outlier observations. For the latter, we will use the IQR interquartile range method. This is the difference between the third and first quartiles. Since between these quartiles is, by definition, 50% of all observations (located centrally in the distribution), so the greater the width of the interquartile range, the greater the variation in the trait. In addition, at the end we will perform balancing of our set using 4 techniques: SMOTE, SMOTE+Tomek, SMOTE ENN.

In [31]:
#deep data copy

df_clear = df.copy()

bmi ¶

TOC

  • imputation of missing data using the median
  • creating a new categorical variable based on an existing numerical variable and coding it. For categorization, we will use the categories proposed by the 'Centers for Disease Control and Prevention' (https://www.cdc.gov/healthyweight/assessing/bmi/adult_bmi/index.html). image.png
  • adding a numerical variable without outliers by detecting them with the IQR method and restricting their values to 0.9 quantile and 0.1 quantile
  • log-transformation of the variable and detecting them with the IQR method and limiting their values to 0.9 quantile and 0.1 quantile
In [32]:
#imputation of the median in NA places
df_clear['bmi_no_nan'] = df_clear.bmi.fillna(df_clear.bmi.median())

#numeric variable without outliers
df_clear['bmi_no_outliers'] = df_clear.bmi_no_nan
df_clear = iqr_outliers(df_clear, 'bmi_no_outliers')

#log-transformation without outliers
df_clear['log_bmi_no_outliers'] = np.log(df_clear.bmi_no_nan)
df_clear = iqr_outliers(df_clear, 'log_bmi_no_outliers')

#new categorical variable according to BMI indicators
df_clear.loc[(df_clear.bmi_no_outliers < 18.5), 'bmi_categorical'] = 'underweight'
df_clear.loc[(df_clear.bmi_no_outliers >= 18.5) & (df_clear.bmi_no_outliers < 24.9), 'bmi_categorical'] = 'normal'
df_clear.loc[(df_clear.bmi_no_outliers >= 24.9) & (df_clear.bmi_no_outliers < 29.9), 'bmi_categorical'] = 'overweight'
df_clear.loc[(df_clear.bmi_no_outliers >= 29.9) & (df_clear.bmi_no_outliers < 34.9), 'bmi_categorical'] = 'obese'
df_clear.loc[(df_clear.bmi_no_outliers >= 34.9), 'bmi_categorical'] = 'extremely obese'

#interval median coding
df_clear.loc[df_clear.bmi_categorical == 'underweight', 'bmi_categorical_encoded'] = int(df_clear.loc[df_clear.bmi_categorical == 'underweight','bmi_no_outliers'].median())
df_clear.loc[df_clear.bmi_categorical == 'normal', 'bmi_categorical_encoded'] = int(df_clear.loc[df_clear.bmi_categorical == 'normal','bmi_no_outliers'].median())
df_clear.loc[df_clear.bmi_categorical == 'overweight', 'bmi_categorical_encoded'] = int(df_clear.loc[df_clear.bmi_categorical == 'overweight','bmi_no_outliers'].median())
df_clear.loc[df_clear.bmi_categorical == 'obese', 'bmi_categorical_encoded'] = int(df_clear.loc[df_clear.bmi_categorical == 'obese','bmi_no_outliers'].median())
df_clear.loc[df_clear.bmi_categorical == 'extremely obese', 'bmi_categorical_encoded'] = int(df_clear.loc[df_clear.bmi_categorical == 'extremely obese','bmi_no_outliers'].median())
df_clear.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       5110 non-null   int64  
 1   gender                   5110 non-null   object 
 2   age                      5110 non-null   float64
 3   hypertension             5110 non-null   int64  
 4   heart_disease            5110 non-null   int64  
 5   ever_married             5110 non-null   object 
 6   work_type                5110 non-null   object 
 7   Residence_type           5110 non-null   object 
 8   avg_glucose_level        5110 non-null   float64
 9   bmi                      4909 non-null   float64
 10  smoking_status           5110 non-null   object 
 11  stroke                   5110 non-null   int64  
 12  bmi_no_nan               5110 non-null   float64
 13  bmi_no_outliers          5110 non-null   float64
 14  log_bmi_no_outliers      5110 non-null   float64
 15  bmi_categorical          5110 non-null   object 
 16  bmi_categorical_encoded  5110 non-null   float64
dtypes: float64(7), int64(4), object(6)
memory usage: 678.8+ KB
In [33]:
for col, types in zip(['bmi_no_nan', 'bmi_categorical', 'bmi_no_outliers', 'log_bmi_no_outliers'], ['class_hist_box','class_hist','class_hist_box','class_hist_box']):
  plotting(data = df_clear, x=col, type=types)

We can see that filling in data gaps resulted in a peak at the median point which is a natural effect of the method used.

In addition, the overweight group is the most numerous, and this is also where the most stroke patients are found. In the case of removing outliers, we can observe getting rid of the right-hand tail, while the use of log-transformation seems to have no significant effect other than rescaling, which will be unnecessary since we will be standardizing the variables at a later stage, so we will discard this variable.

Moreover, we can see that the assignment of quantile limits did not eliminate outlier data in the stroke group. In this situation, we can remove observations specifically for this group, but these are not isolated cases and the group is not very large anyway. In this case, due to the further slight cluttering of the variable by these observations, we will stay with the categorical variable binning to the appropriate group. This is a fairly common solution for BMI, as it is a common body weight index.

gender ¶

TOC

  • due to the fact that this is only one observation and it is difficult to substitute it properly (the other variables are heavily scattered, so they do not carry enough information for imputation) will be removed
  • coding of the variable
In [34]:
df_clear = df_clear.loc[df_clear.gender != 'Other',:]
df_clear.loc[df_clear.gender == 'Female', 'is_female'] = 1
df_clear.loc[df_clear.gender == 'Male', 'is_female'] = 0
df_clear.is_female = df_clear.is_female.astype('int')

df_clear.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5109 entries, 0 to 5109
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       5109 non-null   int64  
 1   gender                   5109 non-null   object 
 2   age                      5109 non-null   float64
 3   hypertension             5109 non-null   int64  
 4   heart_disease            5109 non-null   int64  
 5   ever_married             5109 non-null   object 
 6   work_type                5109 non-null   object 
 7   Residence_type           5109 non-null   object 
 8   avg_glucose_level        5109 non-null   float64
 9   bmi                      4908 non-null   float64
 10  smoking_status           5109 non-null   object 
 11  stroke                   5109 non-null   int64  
 12  bmi_no_nan               5109 non-null   float64
 13  bmi_no_outliers          5109 non-null   float64
 14  log_bmi_no_outliers      5109 non-null   float64
 15  bmi_categorical          5109 non-null   object 
 16  bmi_categorical_encoded  5109 non-null   float64
 17  is_female                5109 non-null   int32  
dtypes: float64(7), int32(1), int64(4), object(6)
memory usage: 738.4+ KB

age ¶

TOC

  • getting rid of outliers for the 'stroke' class, as you can see the differentiation for this group and leaving them could introduce unnecessary noise
In [35]:
#board of stroke patients
df_stroke = df_clear[df_clear.stroke == 1].copy()
#getting rid of outliers
df_stroke = iqr_outliers(df=df_stroke, col='age')
#reconcatenation of the set
df_clear = pd.concat([df_clear[df_clear.stroke==0], df_stroke])
#plot
plotting(data=df_clear, x='age', type='class_hist_box')
#summary
df_clear.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5109 entries, 249 to 248
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       5109 non-null   int64  
 1   gender                   5109 non-null   object 
 2   age                      5109 non-null   float64
 3   hypertension             5109 non-null   int64  
 4   heart_disease            5109 non-null   int64  
 5   ever_married             5109 non-null   object 
 6   work_type                5109 non-null   object 
 7   Residence_type           5109 non-null   object 
 8   avg_glucose_level        5109 non-null   float64
 9   bmi                      4908 non-null   float64
 10  smoking_status           5109 non-null   object 
 11  stroke                   5109 non-null   int64  
 12  bmi_no_nan               5109 non-null   float64
 13  bmi_no_outliers          5109 non-null   float64
 14  log_bmi_no_outliers      5109 non-null   float64
 15  bmi_categorical          5109 non-null   object 
 16  bmi_categorical_encoded  5109 non-null   float64
 17  is_female                5109 non-null   int32  
dtypes: float64(7), int32(1), int64(4), object(6)
memory usage: 738.4+ KB

ever_married ¶

TOC

  • encoding of the variable
In [36]:
df_clear.loc[df_clear.ever_married == 'Yes', 'is_married'] = 1
df_clear.loc[df_clear.ever_married == 'No', 'is_married'] = 0
df_clear.is_married = df_clear.is_married.astype('int')

df_clear.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5109 entries, 249 to 248
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       5109 non-null   int64  
 1   gender                   5109 non-null   object 
 2   age                      5109 non-null   float64
 3   hypertension             5109 non-null   int64  
 4   heart_disease            5109 non-null   int64  
 5   ever_married             5109 non-null   object 
 6   work_type                5109 non-null   object 
 7   Residence_type           5109 non-null   object 
 8   avg_glucose_level        5109 non-null   float64
 9   bmi                      4908 non-null   float64
 10  smoking_status           5109 non-null   object 
 11  stroke                   5109 non-null   int64  
 12  bmi_no_nan               5109 non-null   float64
 13  bmi_no_outliers          5109 non-null   float64
 14  log_bmi_no_outliers      5109 non-null   float64
 15  bmi_categorical          5109 non-null   object 
 16  bmi_categorical_encoded  5109 non-null   float64
 17  is_female                5109 non-null   int32  
 18  is_married               5109 non-null   int32  
dtypes: float64(7), int32(2), int64(4), object(6)
memory usage: 758.4+ KB

work_type ¶

TOC

  • encoding of the variable
In [37]:
le = preprocessing.LabelEncoder()
le.fit(df_clear.work_type)
df_clear['work_type_encoded'] = le.transform(df_clear.work_type)
df_clear.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5109 entries, 249 to 248
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       5109 non-null   int64  
 1   gender                   5109 non-null   object 
 2   age                      5109 non-null   float64
 3   hypertension             5109 non-null   int64  
 4   heart_disease            5109 non-null   int64  
 5   ever_married             5109 non-null   object 
 6   work_type                5109 non-null   object 
 7   Residence_type           5109 non-null   object 
 8   avg_glucose_level        5109 non-null   float64
 9   bmi                      4908 non-null   float64
 10  smoking_status           5109 non-null   object 
 11  stroke                   5109 non-null   int64  
 12  bmi_no_nan               5109 non-null   float64
 13  bmi_no_outliers          5109 non-null   float64
 14  log_bmi_no_outliers      5109 non-null   float64
 15  bmi_categorical          5109 non-null   object 
 16  bmi_categorical_encoded  5109 non-null   float64
 17  is_female                5109 non-null   int32  
 18  is_married               5109 non-null   int32  
 19  work_type_encoded        5109 non-null   int32  
dtypes: float64(7), int32(3), int64(4), object(6)
memory usage: 778.3+ KB

residence_type ¶

TOC

  • encoding of the variable
In [38]:
df_clear.loc[df_clear.Residence_type == 'Rural', 'is_rural'] = 1
df_clear.loc[df_clear.Residence_type == 'Urban', 'is_rural'] = 0 
df_clear.is_rural = df_clear.is_rural.astype('int')

df_clear.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5109 entries, 249 to 248
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       5109 non-null   int64  
 1   gender                   5109 non-null   object 
 2   age                      5109 non-null   float64
 3   hypertension             5109 non-null   int64  
 4   heart_disease            5109 non-null   int64  
 5   ever_married             5109 non-null   object 
 6   work_type                5109 non-null   object 
 7   Residence_type           5109 non-null   object 
 8   avg_glucose_level        5109 non-null   float64
 9   bmi                      4908 non-null   float64
 10  smoking_status           5109 non-null   object 
 11  stroke                   5109 non-null   int64  
 12  bmi_no_nan               5109 non-null   float64
 13  bmi_no_outliers          5109 non-null   float64
 14  log_bmi_no_outliers      5109 non-null   float64
 15  bmi_categorical          5109 non-null   object 
 16  bmi_categorical_encoded  5109 non-null   float64
 17  is_female                5109 non-null   int32  
 18  is_married               5109 non-null   int32  
 19  work_type_encoded        5109 non-null   int32  
 20  is_rural                 5109 non-null   int32  
dtypes: float64(7), int32(4), int64(4), object(6)
memory usage: 798.3+ KB

avg_glucose_level ¶

TOC

  • creating a new categorical variable based on an existing numerical variable and coding it
  • adding a numerical variable without outliers by detecting them with the IQR method and restricting their values to 0.9 quantile and 0.1 quantile
  • log-transformation of the variable and detecting them with the IQR method and limiting their values to 0.9 quantile and 0.1 quantile
In [39]:
#new categorical variable according to BMI indicators
df_clear.loc[(df_clear.avg_glucose_level < 140), 'glucose_categorical'] = 'normal'
df_clear.loc[(df_clear.avg_glucose_level >= 140) & (df_clear.avg_glucose_level <= 199), 'glucose_categorical'] = 'prediabetes'
df_clear.loc[(df_clear.avg_glucose_level > 199), 'glucose_categorical'] = 'diabetes'

#coding
le_gl = preprocessing.LabelEncoder()
le_gl.fit(df_clear.glucose_categorical)
df_clear['glucose_categorical_encoded'] = le_gl.transform(df_clear.glucose_categorical)
df_clear.info()

#numeric variable without outliers
df_clear['glucose_no_outliers'] = df_clear.avg_glucose_level
df_clear = iqr_outliers(df_clear, 'glucose_no_outliers')

#log-transformation without outliers
df_clear['log_glucose_no_outliers'] = np.log(df_clear.avg_glucose_level)
df_clear = iqr_outliers(df_clear, 'log_glucose_no_outliers')
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5109 entries, 249 to 248
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           5109 non-null   int64  
 1   gender                       5109 non-null   object 
 2   age                          5109 non-null   float64
 3   hypertension                 5109 non-null   int64  
 4   heart_disease                5109 non-null   int64  
 5   ever_married                 5109 non-null   object 
 6   work_type                    5109 non-null   object 
 7   Residence_type               5109 non-null   object 
 8   avg_glucose_level            5109 non-null   float64
 9   bmi                          4908 non-null   float64
 10  smoking_status               5109 non-null   object 
 11  stroke                       5109 non-null   int64  
 12  bmi_no_nan                   5109 non-null   float64
 13  bmi_no_outliers              5109 non-null   float64
 14  log_bmi_no_outliers          5109 non-null   float64
 15  bmi_categorical              5109 non-null   object 
 16  bmi_categorical_encoded      5109 non-null   float64
 17  is_female                    5109 non-null   int32  
 18  is_married                   5109 non-null   int32  
 19  work_type_encoded            5109 non-null   int32  
 20  is_rural                     5109 non-null   int32  
 21  glucose_categorical          5109 non-null   object 
 22  glucose_categorical_encoded  5109 non-null   int32  
dtypes: float64(7), int32(5), int64(4), object(7)
memory usage: 858.2+ KB
In [40]:
for col, types in zip(['glucose_categorical', 'glucose_no_outliers', 'log_glucose_no_outliers'], ['class_hist','class_hist_box','class_hist_box']):
  plotting(data = df_clear, x=col, type=types)

The group of people with the normal category of average blood glucose level is the most numerous, and this is also where people with stroke are most numerous (degree I obesity was also included there).

In the context of the categorization of the variable, rather a drop in information was obtained, as the single most numerous group of people with stroke and others with similar numbers remained. Moreover, as mentioned, it's hard to categorize unequivocally because we don't know the characteristics of the sugar measurement, as well as additional diseases of patients such as diabetes.

We also see that the IQR method with imputation, instead of removing the data, further retained some outlier data. However, when using log-transformation, these are the few values that are strongly close to the upper value. With this in mind, as well as the fact that the information drops after categorization, we will limit ourselves to this variable.

smoking_status ¶

TOC

  • creation of a new group 'Underage' for minors
  • coding of the variable
In [41]:
df_clear['smoking_status_new'] = df_clear.smoking_status
df_clear.loc[df_clear.age <= 21, 'smoking_status_new'] = 'underage'
print("The 'Unknown' group represents %.2f percent of observations of the total set" % (len(df_clear[df_clear.smoking_status_new == 'underage'])/len(df_clear)))
plotting(data=df_clear, x='smoking_status_new', type='class_hist')
df_clear.info()
The 'Unknown' group represents 0.21 percent of observations of the total set
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5109 entries, 249 to 248
Data columns (total 26 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           5109 non-null   int64  
 1   gender                       5109 non-null   object 
 2   age                          5109 non-null   float64
 3   hypertension                 5109 non-null   int64  
 4   heart_disease                5109 non-null   int64  
 5   ever_married                 5109 non-null   object 
 6   work_type                    5109 non-null   object 
 7   Residence_type               5109 non-null   object 
 8   avg_glucose_level            5109 non-null   float64
 9   bmi                          4908 non-null   float64
 10  smoking_status               5109 non-null   object 
 11  stroke                       5109 non-null   int64  
 12  bmi_no_nan                   5109 non-null   float64
 13  bmi_no_outliers              5109 non-null   float64
 14  log_bmi_no_outliers          5109 non-null   float64
 15  bmi_categorical              5109 non-null   object 
 16  bmi_categorical_encoded      5109 non-null   float64
 17  is_female                    5109 non-null   int32  
 18  is_married                   5109 non-null   int32  
 19  work_type_encoded            5109 non-null   int32  
 20  is_rural                     5109 non-null   int32  
 21  glucose_categorical          5109 non-null   object 
 22  glucose_categorical_encoded  5109 non-null   int32  
 23  glucose_no_outliers          5109 non-null   float64
 24  log_glucose_no_outliers      5109 non-null   float64
 25  smoking_status_new           5109 non-null   object 
dtypes: float64(9), int32(5), int64(4), object(8)
memory usage: 977.9+ KB

The Unknown group is still quite large, but we have brought out a more diverse distribution instead and will leave it that way. This way we can see that in almost every group we have an equal proportion of people with stroke as well as without stroke. This is not a very informative variable detailing information on stroke patients. We could isolate the 'underage' group here, and aggregate the others to get a binary variable, but it will really only be based on age, not actual smoking status. So we will discard this variable in further analysis.

Correlation analysis ¶

TOC

In [42]:
# exclude categorical data
plotting(data=df_clear.loc[:,~df_clear.columns.isin(
    ['id', 'bmi_categorical_encoded','work_type_encoded','glucose_categorical_encoded'])], x='none', type='heatmap')
Out[42]:
0

Weak correlation is seen between variables and between variables and the target variable. The largest values are seen with the same variables, but after the transformation or coding which is an obvious result - only the is_married variable correlates with the age variable, but we will not, however, exclude any of them. So there is no problem of collinearity here. It is also hard to make a selection of variables on this basis, so we will rather use LASSO regularization at a later stage.

Polynomial Features ¶

TOC

Very often the input variables are interactive with each other in a non-linear way. Such interactions can later be detected and used by the model at the variable selection stage. What's more, raising some variables to a certain power can help to extract important relationships between that variable and the target variable. Mostly, raising to the second (possibly third) degree is enough, as polynomials of the fourth and higher degree are excessively flexible, have irregular shapes and can lead to overfitting. We will use polynomial features of degree two.

In [43]:
#get PolynomialFeatures transfomer
poly = preprocessing.PolynomialFeatures(2)

#select only apriori chosen variables
X_poly = df_clear.select_dtypes(['int32','int64','float32','float64'])
X_poly = X_poly.loc[:,~X_poly.columns.isin(['id', 'bmi_no_nan','bmi','avg_glucose_level','glucose_no_outliers', 'log_bmi_no_outliers', 'bmi_no_outliers','glucose_categorical_encoded'])]
print("Ramka wejściowa")
X_poly.info()

print("\nRamka wyjściowa po PolyFeatures transformacji")
df_poly = pd.DataFrame(poly.fit_transform(X_poly), columns = poly.get_feature_names_out(X_poly.columns))
df_poly.info()
Ramka wejściowa
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5109 entries, 249 to 248
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      5109 non-null   float64
 1   hypertension             5109 non-null   int64  
 2   heart_disease            5109 non-null   int64  
 3   stroke                   5109 non-null   int64  
 4   bmi_categorical_encoded  5109 non-null   float64
 5   is_female                5109 non-null   int32  
 6   is_married               5109 non-null   int32  
 7   work_type_encoded        5109 non-null   int32  
 8   is_rural                 5109 non-null   int32  
 9   log_glucose_no_outliers  5109 non-null   float64
dtypes: float64(3), int32(4), int64(3)
memory usage: 359.2 KB

Ramka wyjściowa po PolyFeatures transformacji
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5109 entries, 0 to 5108
Data columns (total 66 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   1                                                5109 non-null   float64
 1   age                                              5109 non-null   float64
 2   hypertension                                     5109 non-null   float64
 3   heart_disease                                    5109 non-null   float64
 4   stroke                                           5109 non-null   float64
 5   bmi_categorical_encoded                          5109 non-null   float64
 6   is_female                                        5109 non-null   float64
 7   is_married                                       5109 non-null   float64
 8   work_type_encoded                                5109 non-null   float64
 9   is_rural                                         5109 non-null   float64
 10  log_glucose_no_outliers                          5109 non-null   float64
 11  age^2                                            5109 non-null   float64
 12  age hypertension                                 5109 non-null   float64
 13  age heart_disease                                5109 non-null   float64
 14  age stroke                                       5109 non-null   float64
 15  age bmi_categorical_encoded                      5109 non-null   float64
 16  age is_female                                    5109 non-null   float64
 17  age is_married                                   5109 non-null   float64
 18  age work_type_encoded                            5109 non-null   float64
 19  age is_rural                                     5109 non-null   float64
 20  age log_glucose_no_outliers                      5109 non-null   float64
 21  hypertension^2                                   5109 non-null   float64
 22  hypertension heart_disease                       5109 non-null   float64
 23  hypertension stroke                              5109 non-null   float64
 24  hypertension bmi_categorical_encoded             5109 non-null   float64
 25  hypertension is_female                           5109 non-null   float64
 26  hypertension is_married                          5109 non-null   float64
 27  hypertension work_type_encoded                   5109 non-null   float64
 28  hypertension is_rural                            5109 non-null   float64
 29  hypertension log_glucose_no_outliers             5109 non-null   float64
 30  heart_disease^2                                  5109 non-null   float64
 31  heart_disease stroke                             5109 non-null   float64
 32  heart_disease bmi_categorical_encoded            5109 non-null   float64
 33  heart_disease is_female                          5109 non-null   float64
 34  heart_disease is_married                         5109 non-null   float64
 35  heart_disease work_type_encoded                  5109 non-null   float64
 36  heart_disease is_rural                           5109 non-null   float64
 37  heart_disease log_glucose_no_outliers            5109 non-null   float64
 38  stroke^2                                         5109 non-null   float64
 39  stroke bmi_categorical_encoded                   5109 non-null   float64
 40  stroke is_female                                 5109 non-null   float64
 41  stroke is_married                                5109 non-null   float64
 42  stroke work_type_encoded                         5109 non-null   float64
 43  stroke is_rural                                  5109 non-null   float64
 44  stroke log_glucose_no_outliers                   5109 non-null   float64
 45  bmi_categorical_encoded^2                        5109 non-null   float64
 46  bmi_categorical_encoded is_female                5109 non-null   float64
 47  bmi_categorical_encoded is_married               5109 non-null   float64
 48  bmi_categorical_encoded work_type_encoded        5109 non-null   float64
 49  bmi_categorical_encoded is_rural                 5109 non-null   float64
 50  bmi_categorical_encoded log_glucose_no_outliers  5109 non-null   float64
 51  is_female^2                                      5109 non-null   float64
 52  is_female is_married                             5109 non-null   float64
 53  is_female work_type_encoded                      5109 non-null   float64
 54  is_female is_rural                               5109 non-null   float64
 55  is_female log_glucose_no_outliers                5109 non-null   float64
 56  is_married^2                                     5109 non-null   float64
 57  is_married work_type_encoded                     5109 non-null   float64
 58  is_married is_rural                              5109 non-null   float64
 59  is_married log_glucose_no_outliers               5109 non-null   float64
 60  work_type_encoded^2                              5109 non-null   float64
 61  work_type_encoded is_rural                       5109 non-null   float64
 62  work_type_encoded log_glucose_no_outliers        5109 non-null   float64
 63  is_rural^2                                       5109 non-null   float64
 64  is_rural log_glucose_no_outliers                 5109 non-null   float64
 65  log_glucose_no_outliers^2                        5109 non-null   float64
dtypes: float64(66)
memory usage: 2.6 MB

As we can see, we have obtained all possible combinations of 2nd degree polynomial variables. However, we will exclude to start with variables that we will certainly not use such as bias: '1' and binary variables squared, as well as variables with 'stroke' interaction in order to avoid the so-called. data leakage.

In [44]:
col_drop = list(df_poly.filter(regex='stroke'))[1:] + ['1','hypertension^2','heart_disease^2','is_female^2','is_married^2','is_rural^2']
df_poly = df_poly[df_poly.columns.drop(col_drop)]

cat_features = ['is_rural','is_married', 'is_female', 'hypertension', 'heart_disease', 'work_type_encoded']

col_filter = set(list(df_poly.filter(regex='age')) + list(df_poly.filter(regex='log_glucose_no_outliers')) + cat_features + ['bmi_categorical_encoded','stroke'])
df_poly = df_poly[col_filter]
df_poly.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5109 entries, 0 to 5108
Data columns (total 27 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   age heart_disease                                5109 non-null   float64
 1   is_married log_glucose_no_outliers               5109 non-null   float64
 2   age is_married                                   5109 non-null   float64
 3   age hypertension                                 5109 non-null   float64
 4   age log_glucose_no_outliers                      5109 non-null   float64
 5   age bmi_categorical_encoded                      5109 non-null   float64
 6   is_married                                       5109 non-null   float64
 7   age is_rural                                     5109 non-null   float64
 8   bmi_categorical_encoded                          5109 non-null   float64
 9   log_glucose_no_outliers^2                        5109 non-null   float64
 10  is_female log_glucose_no_outliers                5109 non-null   float64
 11  stroke                                           5109 non-null   float64
 12  heart_disease log_glucose_no_outliers            5109 non-null   float64
 13  heart_disease                                    5109 non-null   float64
 14  is_rural log_glucose_no_outliers                 5109 non-null   float64
 15  age^2                                            5109 non-null   float64
 16  hypertension log_glucose_no_outliers             5109 non-null   float64
 17  work_type_encoded                                5109 non-null   float64
 18  work_type_encoded log_glucose_no_outliers        5109 non-null   float64
 19  is_female                                        5109 non-null   float64
 20  age is_female                                    5109 non-null   float64
 21  age work_type_encoded                            5109 non-null   float64
 22  bmi_categorical_encoded log_glucose_no_outliers  5109 non-null   float64
 23  log_glucose_no_outliers                          5109 non-null   float64
 24  hypertension                                     5109 non-null   float64
 25  is_rural                                         5109 non-null   float64
 26  age                                              5109 non-null   float64
dtypes: float64(27)
memory usage: 1.1 MB

Balancing data set ¶

TOC

  • Balancing the dataset using SMOTE, SMOTE+TOMEK, SMOTE ENN algorithms. When performing balancing, we will aim for a 50/50 ratio. In later stages, we will possibly return to the best balancing technique and try other class ratios.

It is important to balance the data only on the training set, because in general the test set is a set that is supposed to reflect the real data - and the balanced ones are not.

In [45]:
#Extracting the matrix of variables X and the vector of the target variable Y
X = df_poly.loc[:,~df_poly.columns.isin(['stroke'])]
Y = df_poly.stroke

#train-test split to balance only on the training set
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=123)

#for visualization
X_train['index'] = X_train.reset_index().index
In [46]:
df_imbalanced = X_train.copy()
df_imbalanced['stroke'] = y_train
In [47]:
#plot showing the problem of unbalanced data - the class of stroke patients is almost invisible
plotting(data = df_imbalanced,x='index', y='age', type='scatter')
Out[47]:
0

SMOTE ¶

TOC

SMOTE uses the KNN algorithmic approach i.e. for each observation $x$ selects $k$ nearest neighbors and randomly selects one $x'$ of them. Then the difference between the $x$ and $x'$ coordinates is calculated and multiplied randomly by numbers in the interval $(0,1)$. The $x''$ observation thus created from the differences is added to the set. Geometrically, this maps the creation of a new observation based on the displacement of the current observation to one of its neighbors.

In [48]:
# SMOTE
df_smote, X_smote, y_smote = balance_data(X=X_train, y=y_train, cat_features = cat_features, algorithm = 'SMOTE', strategy=1, k_neighbors = 7)
plotting(data = df_smote, x='index', y='age', type='scatter')
Liczba wszystkich obserwacji:  6530

Liczba obserwacji z klasy mniejszościowej:  3265

Procent klasy mniejszościowej do całości po zbalansowaniu:  0.5
Out[48]:
0

SMOTE+Tomek ¶

TOC

SMOTE + Tomek is a hybrid technique that combines the oversampling technique of the SMOTE algorithm with the undersampling technique of the Tomek algorithm. After using SMOTE, there is a possibility of overlapping class clusters, which can result in overfitting. In that case, the Tomek algorithm is additionally used, which involves pairing samples from different classes together, and then observations from the majority class are removed in order to (sometimes from both) separate the classes more clearly.

In [49]:
# SMOTE Tomek
df_smtom, X_smtom, y_smtom = balance_data(X=X_train, y=y_train, cat_features = cat_features, algorithm='SMOTETOMEK', strategy = 1, k_neighbors=7)
plotting(data = df_smtom, x='index', y='age', type='scatter')
Liczba wszystkich obserwacji:  6432

Liczba obserwacji z klasy mniejszościowej:  3216

Procent klasy mniejszościowej do całości po zbalansowaniu:  0.5
Out[49]:
0

SMOTE + ENN ¶

TOC

SMOTE + ENN is a hybrid technique that involves the additional removal of a number of observations from the feature space. Once a collection is oversampled by SMOTE then ENN, which is an undersampling technique, is applied next. ENN (Edited Nearest Neighbor) involves finding the K nearest neighbors of each observation. It then checks if the majority class of these neighbors matches the class of the observation, if not then both the observation and the neighbors are removed. With this approach, the separation of classes from each other is clearer than in other cases.

In [50]:
# SMOTE ENN
df_smenn, X_smenn, y_smenn = balance_data(X=X_train, y=y_train, cat_features = cat_features, algorithm='SMOTEENN', strategy = 1)
plotting(data = df_smenn, x='index', y='age',type='scatter')
Liczba wszystkich obserwacji:  6320

Liczba obserwacji z klasy mniejszościowej:  3055

Procent klasy mniejszościowej do całości po zbalansowaniu:  0.4833860759493671
Out[50]:
0

We can see that each algorithm balances in its own specific way generates artificial observations for the minority class. In the SMOTE ENN algorithm, we can further see a decrease in the number of observations due to an explicit undersampling technique. We will perform model training on each set for each model to have a broader view of the results. Moreover, we will also perform LASSO regularization for each of them.

Metrics ¶

TOC

In general, for evaluating solutions to classification problems, a confusion matrix is used. image.png

It juxtaposes the true class values with the modeled class values. In the binary case, this can be represented as TP, or cases classified correctly into the true class, TN, or cases classified correctly into the false class, FP, or cases classified incorrectly into the true class, and FN, or cases classified incorrectly into the false class. FP values are so-called Type I error, that is, classifying something as true when in fact it is not. FN values, on the other hand, are type II error, that is, classifying something as false when in fact it is true.

Of this combination, the most popular metrics are:

  • $Accuracy = \frac{(TP+TN)}{(TP+FP+TN+FN)}$ i.e. the proportion of correct results among all possible ones.
  • $Precision = \frac{TP}{(TP+FP)}$ i.e. the precision of the model i.e. how many of those classified into the true class actually belong to it
  • $Recall = \frac{TP}{(TP+FN)}$ i.e. the proportion of correct results from the true class to all possible ones
  • $F_1 = 2\cdot \frac{precision \cdot recall}{precision + recall}$ i.e. the harmonic average between the Precision and Recall metrics

In our case, we want to focus on minimizing Type II error at the expense of Type I error. This is because if a Type I error is made, i.e., a false prediction of stroke, unnecessary tests may be performed and financial resources incurred, while a Type II error carries serious health consequences and, in the worst case, death.

However, it is not enough to minimize FN, because in such a case every subsequent patient would be classified as a stroke patient. Some balance is needed between FP and FN therefore we will be guided by the metrics Precision - that is, how many people classified with a stroke actually have a stroke (a high precision value implies a low FP value) and Recall (a high recall value implies a low FN value), as well as their harmonic mean $F_1$.

In summary, for the class of stroke patients, the more relevant metric is Recall, which tells us what is the proportion of people classified with stroke to all people who should be classified. We expect a high value, as this will mean that we are correctly searching for people with stroke.

For non-stroke patients, we will look more at the Precision metric, which in turn tells us how many classified as non-stroke actually do not have a stroke. A high value will mean that we are accurately searching for people without stroke.

Modelling ¶

TOC

There are many variables in our data set after the introduction of polynomial features. While we did not remove too many of them during data preprocessing and still the number of variables is much smaller than the number of observations, still an excessive number of them can lead to overfitting phenomena, affect explainability, or introduce unnecessary noise into the model.

Therefore, we want to reduce this number by selecting the most frequent variables (although other ways like dimensionality reduction through PCA, for example, are possible).

We will train 3 models:

  1. logistic regression

  2. the SVM or support vector machine.

  3. randomForest

Each model will be trained after prior feature selection using logistic regression with LASSO regularization. This regularization zeros out certain coefficients and it will be trained using several regularization parameters among which the best one for the ROC AUC metric will be selected.

After selecting these variables, we will train all models on this selected set, along with searching the grid of hyperparameters of each model maximizing the ROC AUC score which will help us select the final model.

In the last step, we will proceed to the selection of threshold $p$, which is the probability value from which we will classify stroke patients. For logistic regression and RFC, this is natural, as these methods are inherently probabilistic and their algorithms return values for classification probabilities. For SVM, we can set the probability=True hyperparameter value, so we are able to retrieve these probabilities.

The whole thing is done behind the scenes using logistic regression on the SVM results data (Platt's algorithm). This technique is not entirely reliable and has caveats, namely:

  • it is carried out through cross-validation so it is an expensive operation, especially for large data sets
  • the values of the probabilities may not be equal to the estimated classes, as the argmax of the result is not equal to the argmax of the probability in each case
  • as a result of the above, the class may be considered positive, even though the probability < 0.5

In the context of the caveats setting our own threshold, we will somehow control this thereby reducing the effectiveness of the SVM algorithm itself, and focusing on the effectiveness of the Platt technique in estimating these probabilities.

Selecting the balancing technique ¶

TOC

In [51]:
report = pd.DataFrame()
lr_base = LogisticRegressionWithThreshold(penalty='l1', solver='saga',max_iter=10000, random_state = 123)
for X, y, name in zip([X_smote, X_smtom, X_smenn],[y_smote,y_smtom, y_smenn], ['SMOTE', 'SMOTE+TOMEK','SMOTE+ENN']):
  print('\n'+name +'\n')
  columns_to_standardize = list(set(list(X.filter(regex='age')) + list(X.filter(regex='log_glucose_no_outliers'))))

  results = fit_pred_score(X, y, X_test, y_test, lr_base, dataset_name = name, model_name = 'Base LR', scaler = True, ohe=True, columns_to_standardize = columns_to_standardize, cat_features = cat_features, visualize_test = True, visualize_train = True)
  
  report = pd.concat([report, results])
  report.reset_index(inplace=True, drop=True)

report.sort_values(by=['model_name','metric','dataset_name'])
SMOTE

SMOTE+TOMEK

SMOTE+ENN

Out[51]:
No stroke Stroke accuracy macro avg weighted avg metric dataset_name model_name split
2 0.840624 0.840233 0.840429 0.840429 0.840429 f1-score SMOTE Base LR train
6 0.908547 0.246575 0.836892 0.577561 0.872818 f1-score SMOTE Base LR test
18 0.821209 0.820244 0.820728 0.820727 0.820743 f1-score SMOTE+ENN Base LR train
22 0.886301 0.265487 0.803084 0.575894 0.852794 f1-score SMOTE+ENN Base LR test
10 0.842220 0.842171 0.842195 0.842195 0.842195 f1-score SMOTE+TOMEK Base LR train
14 0.908364 0.258760 0.836892 0.583562 0.873302 f1-score SMOTE+TOMEK Base LR test
0 0.839597 0.841265 0.840429 0.840431 0.840431 precision SMOTE Base LR train
4 0.967422 0.164234 0.836892 0.565828 0.924071 precision SMOTE Base LR test
16 0.847005 0.795874 0.820728 0.821440 0.822289 precision SMOTE+ENN Base LR train
20 0.976604 0.166205 0.803084 0.571404 0.932863 precision SMOTE+ENN Base LR test
8 0.842089 0.842302 0.842195 0.842195 0.842195 precision SMOTE+TOMEK Base LR train
12 0.969417 0.171429 0.836892 0.570423 0.926346 precision SMOTE+TOMEK Base LR test
1 0.841654 0.839204 0.840429 0.840429 0.840429 recall SMOTE Base LR train
5 0.856426 0.494505 0.836892 0.675466 0.836892 recall SMOTE Base LR test
17 0.796937 0.846154 0.820728 0.821546 0.820728 recall SMOTE+ENN Base LR train
21 0.811285 0.659341 0.803084 0.735313 0.803084 recall SMOTE+ENN Base LR test
9 0.842351 0.842040 0.842195 0.842195 0.842195 recall SMOTE+TOMEK Base LR train
13 0.854545 0.527473 0.836892 0.691009 0.836892 recall SMOTE+TOMEK Base LR test
3 3265.000000 3265.000000 0.840429 6530.000000 6530.000000 support SMOTE Base LR train
7 1595.000000 91.000000 0.836892 1686.000000 1686.000000 support SMOTE Base LR test
19 3265.000000 3055.000000 0.820728 6320.000000 6320.000000 support SMOTE+ENN Base LR train
23 1595.000000 91.000000 0.803084 1686.000000 1686.000000 support SMOTE+ENN Base LR test
11 3216.000000 3216.000000 0.842195 6432.000000 6432.000000 support SMOTE+TOMEK Base LR train
15 1595.000000 91.000000 0.836892 1686.000000 1686.000000 support SMOTE+TOMEK Base LR test
In [52]:
plot_datasets_scores = transform_report_to_plot(report[(report.model_name=='Base LR') & (report.split=='test')], x='dataset_name', cl='metric')
plotting(data = plot_datasets_scores, x='', y='value', cl='metric', type='lineplot')
print("A graph of the dependence of the values of selected metrics on the balancing technique")
A graph of the dependence of the values of selected metrics on the balancing technique
In [53]:
report_diff = prepare_diff_dataset(report, by='dataset_name')
In [54]:
plot_diff_scores = transform_report_to_plot(report_diff[(report_diff.model_name=='Base LR') & (report_diff.train_test_diffaccuracy.isna()==False)], x='dataset_name', cl='metric', train_test=True)
plotting(data = plot_diff_scores, x='', y='value', cl='metric', type='lineplot')
print("Plot of the dependence of differences between training and test set values of selected metrics on balancing technique")
Plot of the dependence of differences between training and test set values of selected metrics on balancing technique

For a general look, we look at the 'macro-avg' metric as it does not take into account the proportion of classes (in terms of number of occurrences), and we want the 'Stroke' class to be as important in evaluating the model.

Thus for:

  • $F_1$ metric the best average value was obtained by SMOTE+TOMEK
  • Precision metric the best value for the class without stroke was obtained by SMOTE+ENN
  • the Recall metric the best value for the class with impact was obtained by SMOTE+ENN

In general, it can be seen that the model performs quite well without performing preprocessing and selecting hyperparameters. In our case, the choice of data balancing method is not likely to be very important, but 3 possibilities were selected in order to show different techniques as well as some impact on possible results. Here the differences are really small, but due to the higher values, as well as lower differences relative to the training and test set, we will choose the SMOTE+ENN hybrid method.

In [55]:
X = X_smenn.copy()
y = y_smenn.copy()
dataset_name = 'SMOTE+ENN'
columns_to_standardize = list(set(list(X.filter(regex='age')) + list(X.filter(regex='log_glucose_no_outliers')))) + ['bmi_categorical_encoded'] #bmi jest skategoryzowana, ale medianą w celu zachowania pewnej informacji w wartości dlatego ją standaryzujemy

X_train_preprocessed, X_test_preprocessed, standard_scaler = preprocess_data(X, X_test, scaler = True, ohe=True, columns_to_standardize = columns_to_standardize, cat_features = cat_features)

Feature selection using LASSO ¶

TOC

To select variables we will use the LASSO method in this case it will be logistic regression with a penalty function in the L1 metric. We will search the grid of C adjusting parameters (the lower the value, the more restrictive the penalty function). We will maximize the ROC AUC score, that is, maximize the number of true positive predictions while minimizing negative predictions. We will then select the appropriate variables from the best estimator using the SelectFromModel module.

In [56]:
lr_grid = {'C': [1, 0.01, 0.001, 0.0001, 0.00001]}
lr = LogisticRegressionWithThreshold(penalty='l1', solver='saga',max_iter=10000, random_state = 123)
lr_search = GridSearchCV(estimator = lr, param_grid = lr_grid, scoring = 'f1',
                          cv = 3, n_jobs = -1)
lr_search.fit(X_train_preprocessed,y)
Out[56]:
GridSearchCV(cv=3,
             estimator=LogisticRegressionWithThreshold(max_iter=10000,
                                                       penalty='l1',
                                                       random_state=123,
                                                       solver='saga'),
             n_jobs=-1, param_grid={'C': [1, 0.01, 0.001, 0.0001, 1e-05]},
             scoring='f1')
In [57]:
lr = lr_search.best_estimator_
selection_model = SelectFromModel(lr).fit(X_train_preprocessed, y)
coefs = {}
for coef, feature in zip(selection_model.estimator.coef_[0], selection_model.estimator.feature_names_in_):
  coefs[feature] = coef

X_selected = selection_model.transform(X_train_preprocessed)
print("The dimension of the initial dataset: ", X_train_preprocessed.shape)
print("The dimension of the set after the selection of variables: ", X_selected.shape)

coefs
The dimension of the initial dataset:  (6320, 26)
The dimension of the set after the selection of variables:  (6320, 26)
Out[57]:
{'age heart_disease': -0.392978351567809,
 'is_married log_glucose_no_outliers': -1.3696022295920751,
 'age is_married': -0.20652025298948656,
 'age hypertension': -0.5360502118541424,
 'age log_glucose_no_outliers': 3.5426850504075413,
 'age bmi_categorical_encoded': -1.8946165937878954,
 'is_married': 4.113051514840918,
 'age is_rural': -0.24886293590718095,
 'bmi_categorical_encoded': -0.2721730623484636,
 'log_glucose_no_outliers^2': 2.3209387643836163,
 'is_female log_glucose_no_outliers': 0.5107077922181571,
 'heart_disease log_glucose_no_outliers': 0.8344674630856949,
 'heart_disease': -1.578378610652275,
 'is_rural log_glucose_no_outliers': 0.3124455072056034,
 'age^2': -1.762631227290451,
 'hypertension log_glucose_no_outliers': 1.0446600053311177,
 'work_type_encoded': 0.06631646886871642,
 'work_type_encoded log_glucose_no_outliers': 0.2932920260518792,
 'is_female': -0.548647375395707,
 'age is_female': -0.2860917887792188,
 'age work_type_encoded': -0.32965127277763884,
 'bmi_categorical_encoded log_glucose_no_outliers': 1.2135344685285758,
 'log_glucose_no_outliers': -3.2697571834287418,
 'hypertension': -1.8745757969323777,
 'is_rural': -0.3836178773413026,
 'age': 3.299478869264678}

We see that no variable has been "removed" by LASSO by assigning zero weight.

Hyperparameters tuning ¶

TOC

The final step will be to tune the hyperparameters for each model. In logistic regression, we will choose values for the parameter $C$ responsible for regularization. For SVM, we will try different values of gamma, $C$ and linear kernel. For RFC, we will create a hyperparameter space that takes into account, among other things, the depth of the tree. We will use GridSearchCV from the sklearn package for this. The metric we will optimize will be the ROC AUC metric, which reflects the values shown in the graph above. The higher the AUC value, the higher the percentage of true-positive clsayifications with a low percentage of false-positive classifications. This metric allows for a good comparison of models and, like the previous ones chosen, correctly reflects our goal.

In [58]:
svm_grid = {'C': [1, 0.01, 0.0001],
              'gamma': [1, 0.01, 0.0001],
              'kernel': ['rbf', 'linear']}
rfc_grid = {
    'bootstrap': [True],
    'max_depth': [80, 100],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 5],
    'min_samples_split': [8,12],
    'n_estimators': [100, 500, 1000]
}

# Create a based model
svm = SupportVectorMachineWithThreshold()
rfc = RandomForestClassifierWithThreshold(random_state = 123)
# Instantiate the grid search model

svm_search = GridSearchCV(estimator = svm, param_grid = svm_grid, scoring = 'f1',
                          cv = 3, n_jobs = -1, verbose = 2)
rfc_search = GridSearchCV(estimator = rfc, param_grid = rfc_grid, scoring = 'f1',
                          cv = 3, n_jobs = -1, verbose = 2)

for grid in [svm_search, rfc_search]:
  grid.fit(X_selected,y)
  print(grid.best_estimator_)
Fitting 3 folds for each of 18 candidates, totalling 54 fits
SupportVectorMachineWithThreshold(C=1, gamma=1)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
RandomForestClassifierWithThreshold(max_depth=80, max_features=3,
                                    min_samples_leaf=3, min_samples_split=8,
                                    n_estimators=500, random_state=123)

So we got 3 models with the best parameters in terms of the ROC AUC metric from the searched parameter grids. Now we will put them together.

In [59]:
plt.figure(0).clf()

svm = SupportVectorMachineWithThreshold(gamma=1, C=1, kernel='rbf', probability=True, random_state = 123)
rfc = RandomForestClassifierWithThreshold(max_depth=80, max_features=3,
                                    min_samples_leaf=3, min_samples_split=8,
                                    n_estimators=500, random_state=123)

#fit logistic regression model and plot ROC curve
lr.fit(X_selected, y)
svm.fit(X_selected, y)
rfc.fit(X_selected, y)
Out[59]:
RandomForestClassifierWithThreshold(max_depth=80, max_features=3,
                                    min_samples_leaf=3, min_samples_split=8,
                                    n_estimators=500, random_state=123)
<Figure size 800x550 with 0 Axes>
In [60]:
y_pred_lr = lr.predict(X_test_preprocessed)
y_pred_svm = svm.predict(X_test_preprocessed)
y_pred_rfc = rfc.predict(X_test_preprocessed)

fpr_lr, tpr_lr, _ = metrics.roc_curve(y_test, y_pred_lr)
auc = round(metrics.roc_auc_score(y_test, y_pred_lr), 4)
plt.plot(fpr_lr,tpr_lr,label="Logistic Regression, AUC="+str(auc))

fpr_svm, tpr_svm, _ = metrics.roc_curve(y_test, y_pred_svm)
auc = round(metrics.roc_auc_score(y_test, y_pred_svm), 4)
plt.plot(fpr_svm,tpr_svm,label="Support Vector Machine, AUC="+str(auc))

fpr_rfc, tpr_rfc, _ = metrics.roc_curve(y_test, y_pred_rfc)
auc = round(metrics.roc_auc_score(y_test, y_pred_rfc), 4)
plt.plot(fpr_rfc,tpr_rfc,label="Random Forest Classifier, AUC="+str(auc))

#add legend
plt.legend()
plt.show()

We see that among the best estimators for the selected parameter space, logistic regression performs best. In contrast, SVM and RFC perform only slightly better than the baseline estimator, i.e. predicting with a probability of 50%. It's like typing whether someone has a stroke by flipping a coin - SVM and RFC perform only slightly better for this scenario. So we'll limit ourselves to logistic regression and see if the choice of threshold $p$ affects our result.

Selecting threshold $p$ ¶

TOC

In [61]:
for p in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    print('\nValue of p='+str(p)+' for Logistic Regression model\n')
    model_name = 'Logistic Regression with p='+str(p)
    results = fit_pred_score(X_selected, y, X_test_preprocessed, y_test, lr, dataset_name = dataset_name, model_name = model_name, scaler = False, ohe=False, 
                             visualize_test = True, visualize_train = True, threshold = p)
    
    report = pd.concat([report, results])
    report.reset_index(inplace=True, drop=True)

report.sort_values(by=['model_name','metric','dataset_name'])
Value of p=0.1 for Logistic Regression model

Value of p=0.2 for Logistic Regression model

Value of p=0.3 for Logistic Regression model

Value of p=0.4 for Logistic Regression model

Value of p=0.5 for Logistic Regression model

Value of p=0.6 for Logistic Regression model

Value of p=0.7 for Logistic Regression model

Value of p=0.8 for Logistic Regression model

Value of p=0.9 for Logistic Regression model

Out[61]:
No stroke Stroke accuracy macro avg weighted avg metric dataset_name model_name split
2 0.840624 0.840233 0.840429 0.840429 0.840429 f1-score SMOTE Base LR train
6 0.908547 0.246575 0.836892 0.577561 0.872818 f1-score SMOTE Base LR test
18 0.821209 0.820244 0.820728 0.820727 0.820743 f1-score SMOTE+ENN Base LR train
22 0.886301 0.265487 0.803084 0.575894 0.852794 f1-score SMOTE+ENN Base LR test
10 0.842220 0.842171 0.842195 0.842195 0.842195 f1-score SMOTE+TOMEK Base LR train
... ... ... ... ... ... ... ... ... ...
92 0.948214 0.666667 0.947212 0.807440 0.933018 precision SMOTE+ENN Logistic Regression with p=0.9 test
89 0.996937 0.284124 0.652373 0.640531 0.652373 recall SMOTE+ENN Logistic Regression with p=0.9 train
93 0.998746 0.043956 0.947212 0.521351 0.947212 recall SMOTE+ENN Logistic Regression with p=0.9 test
91 3265.000000 3055.000000 0.652373 6320.000000 6320.000000 support SMOTE+ENN Logistic Regression with p=0.9 train
95 1595.000000 91.000000 0.947212 1686.000000 1686.000000 support SMOTE+ENN Logistic Regression with p=0.9 test

96 rows × 9 columns

In [62]:
plot_datasets_scores = transform_report_to_plot(report[(report.model_name.str.contains('Logistic Regression with p')) & (report.split=='test')], x='model_name')
plotting(data = plot_datasets_scores, x='', y='value', cl='metric', type='lineplot')
Out[62]:
0
In [63]:
report_diff_p = prepare_diff_dataset(report, by='model_name')
In [64]:
plot_diff_scores_p = transform_report_to_plot(report_diff_p[(report_diff_p.model_name.str.contains('Logistic Regression with p')) & (report_diff_p.train_test_diffaccuracy.isna()==False)], x='model_name', cl='metric', train_test=True)
plotting(data = plot_diff_scores_p, x='', y='value', cl='metric', type='lineplot')
print("Plot of the dependence of differences between training and test set values of selected metrics on p-values")
#plot_diff_scores_p
Plot of the dependence of differences between training and test set values of selected metrics on p-values

Applications for:

  • Precision values:

    • for the non-stroke class is equal to almost 1 and decreases slightly with increasing $p$ values. This is due to the frequent assignment of individuals to the class with stroke through a low probability value, thus reducing the number of false positives for the class without stroke.
    • for the class with stroke is a low value and increases with an increase in the parameter $p$. The set is unbalanced and with a low probability we will often falsely classify people without stroke as people with stroke.
  • Recall values:

    • for the class without stroke increases significantly with an increase in threshold $p$. This is due to the fact that the higher the value of $p$, the more often we assign people to the class without stroke and thus minimize the value of false negatives for this class.
    • for the class with a stroke also starts at a value close to 1 (similar to Precision), but decreases significantly with an increase in the value of $p$. This is a natural consequence that as this parameter increases, the number of assignments to the class with stroke decreases (a high probability of belonging must be demonstrated), and so the number of patients "missed" when assigned to the class with stroke increases.

As we mentioned at the beginning, we want to maximize the Precision value for the non-impact class (0) and Recall for the impact class (1). In addition, high values of Precision for class (1) and Recall for class (0) will be an asset.

From a value of $p=0.2$ for LR, we can see a strong drop in Recall for the class with impact. At the point of decrease for class (0), the precision value is relatively high, and for class (1) there is a slight decrease relative to Recall. In addition, low $p$ values have low differences in metrics between the training and test set. They are also in line with the "domain sense," i.e., we want to be more attentive to the possibility of a stroke, and rather, a small probability of occurrence is already enough to classify a patient. Admittedly, the risk of false classification increases, but in the worst case, unnecessary costs will be incurred (for such a reason, the value cannot be too low either, since we will almost immediately classify everyone as having a stroke). On the other hand, too high a value for the required probability can put people's health and even lives at risk (this can be seen, for example, in the drastic drop in Recall for Class 1, which means that the number of people classified as having no stroke, where in fact there was one, has strongly increased).

We are interested in the values of $p \in (0.2, 0.3)$, because they have the high values of the Precision for (0) score and Recall for (1) score metrics that we established at the beginning. At the same time, these are the points at which the value of Precision for (1) and Recall for (0) begins to increase with $F_1$ score.

A value of $p=0.1$ would classify almost everyone as a stroke patient, which is an undesirable effect and no model is needed for such conclusions. In contrast, at values of $p > 0.3$ we see a strong drop in Recall for class 1, so we lose confidence that all stroke patients who should have been classified were indeed assigned to that class.

This leaves a decision between the higher values of the apriori set metrics and some domain intuition, since for $p=0.2$ there is also a risk of classifying too many "healthy" patients. So let's check the AUC values for these two parameters, which will tell us which scenario maximizes TPR with minimal FPR.

In [65]:
plt.figure(0).clf()

y_pred_lr1 = lr.predict(X_test_preprocessed, threshold=0.1)
y_pred_lr2 = lr.predict(X_test_preprocessed, threshold=0.2)
y_pred_lr3 = lr.predict(X_test_preprocessed, threshold=0.3)
y_pred_lr4 = lr.predict(X_test_preprocessed, threshold=0.4)


fpr_lr1, tpr_lr1, _ = metrics.roc_curve(y_test, y_pred_lr1)
auc = round(metrics.roc_auc_score(y_test, y_pred_lr1), 4)
plt.plot(fpr_lr1,tpr_lr1,label="Logistic Regression with p=0.1, AUC="+str(auc))

fpr_lr2, tpr_lr2, _ = metrics.roc_curve(y_test, y_pred_lr2)
auc = round(metrics.roc_auc_score(y_test, y_pred_lr2), 4)
plt.plot(fpr_lr2,tpr_lr2,label="Logistic Regression with p=0.2, AUC="+str(auc))

fpr_lr3, tpr_lr3, _ = metrics.roc_curve(y_test, y_pred_lr3)
auc = round(metrics.roc_auc_score(y_test, y_pred_lr3), 4)
plt.plot(fpr_lr3,tpr_lr3,label="Logistic Regression with p=0.3, AUC="+str(auc))

fpr_lr4, tpr_lr4, _ = metrics.roc_curve(y_test, y_pred_lr4)
auc = round(metrics.roc_auc_score(y_test, y_pred_lr4), 4)
plt.plot(fpr_lr4,tpr_lr4,label="Logistic Regression with p=0.4, AUC="+str(auc))

fpr_lr, tpr_lr, _ = metrics.roc_curve(y_test, y_pred_lr)
auc = round(metrics.roc_auc_score(y_test, y_pred_lr), 4)
plt.plot(fpr_lr,tpr_lr,label="Logistic Regression with p=0.5, AUC="+str(auc))

#add legend
plt.legend()
plt.show()

As we can see, the intuitions excerpted in the previous commentary were confirmed.

We achieved $AUC=0.7652$ for $p=0.2$ which is better than the default value of the parameter $p=0.5$ for which $AUC=0.7314$, so there is some improvement in the model. We can also see the fact that the high values of the metrics chosen apriori rightly maximize the AUC value while apparently taking into account the risk of classifying most patients as stroke patients. Evidently, this is a good direction for an unbalanced set like ours.

For $p=0.3$ we have $AUC=0.7521$ which is also not a bad result and after consulting a specialist perhaps would have been chosen as it assigns a class with a higher probability level.

You can also see how with an increase in the parameter $p$ the value of $AUC$ decreases.

Finally, we choose $p=0.2$ as the value of our main model. As a final step, we will prepare a whole ready pipeline for stroke prediction.

Example pipeline ¶

TOC

In [66]:
#Save final model
filename = 'final_lr_no_smoking_status_model.sav'
joblib.dump(lr, filename)
Out[66]:
['final_lr_no_smoking_status_model.sav']
In [67]:
def preprocess_final(X_new: pd.DataFrame, standard_scaler):
  """
  Function for preprocessing raw data. 
  It performs even unwanted operations because of the lack of time to prepare cleaning the code. 
  """
  df_clear = X_new
  
  ### BMI
  df_clear['bmi_no_nan'] = 28.1
  df_clear['bmi_no_outliers'] = df_clear.bmi_no_nan
  df_clear = iqr_outliers(df_clear, 'bmi_no_outliers')

  df_clear.loc[(df_clear.bmi_no_outliers < 18.5), 'bmi_categorical_encoded'] = 17.0
  df_clear.loc[(df_clear.bmi_no_outliers >= 18.5) & (df_clear.bmi_no_outliers < 24.9), 'bmi_categorical_encoded'] = 22.0
  df_clear.loc[(df_clear.bmi_no_outliers >= 24.9) & (df_clear.bmi_no_outliers < 29.9), 'bmi_categorical_encoded'] = 27.0
  df_clear.loc[(df_clear.bmi_no_outliers >= 29.9) & (df_clear.bmi_no_outliers < 34.9), 'bmi_categorical_encoded'] = 32.0
  df_clear.loc[(df_clear.bmi_no_outliers >= 34.9), 'bmi_categorical_encoded'] = 38.0

  ### GENDER
  df_clear = df_clear.loc[df_clear.gender != 'Other',:]
  df_clear.loc[df_clear.gender == 'Female', 'is_female'] = 1
  df_clear.loc[df_clear.gender == 'Male', 'is_female'] = 0
  df_clear.is_female = df_clear.is_female.astype('int')

  ### AGE
  df_clear = iqr_outliers(df=df_clear, col='age')

  ### EVER_MARRIED
  df_clear.loc[df_clear.ever_married == 'Yes', 'is_married'] = 1
  df_clear.loc[df_clear.ever_married == 'No', 'is_married'] = 0
  df_clear.is_married = df_clear.is_married.astype('int')

  ### WORK_TYPE
  le = preprocessing.LabelEncoder()
  le.fit(df_clear.work_type)
  df_clear['work_type_encoded'] = le.transform(df_clear.work_type)
  
  ### RESIDENCE_TYPE
  df_clear.loc[df_clear.Residence_type == 'Rural', 'is_rural'] = 1
  df_clear.loc[df_clear.Residence_type == 'Urban', 'is_rural'] = 0 
  df_clear.is_rural = df_clear.is_rural.astype('int')


  ### AVG_GLUCOSE_LEVEL
  df_clear['log_glucose_no_outliers'] = np.log(df_clear.avg_glucose_level)
  df_clear = iqr_outliers(df_clear, 'log_glucose_no_outliers')
  

  ### POLYNOMIAL FEATURES
  poly = preprocessing.PolynomialFeatures(2)

  #select only apriori chosen variables
  X_poly = df_clear.select_dtypes(['int32','int64','float32','float64'])
  X_poly = X_poly.loc[:,~X_poly.columns.isin(['id','smoking_status', 'bmi_no_nan','bmi','avg_glucose_level','glucose_no_outliers', 'log_bmi_no_outliers', 'bmi_no_outliers','glucose_categorical_encoded'])]
  df_poly = pd.DataFrame(poly.fit_transform(X_poly), columns = poly.get_feature_names_out(X_poly.columns))

  df_poly = df_poly[['age','age bmi_categorical_encoded','age heart_disease','age hypertension','age is_female',
                    'age is_married','age is_rural','age log_glucose_no_outliers','age work_type_encoded',
                     'age^2','bmi_categorical_encoded','bmi_categorical_encoded log_glucose_no_outliers','heart_disease','heart_disease log_glucose_no_outliers',
                     'hypertension','hypertension log_glucose_no_outliers','is_female','is_female log_glucose_no_outliers','is_married','is_married log_glucose_no_outliers',
                     'is_rural','is_rural log_glucose_no_outliers','log_glucose_no_outliers','log_glucose_no_outliers^2','work_type_encoded','work_type_encoded log_glucose_no_outliers']]
  
  cat_features = ['is_rural','is_married', 'is_female', 'hypertension', 'heart_disease', 'work_type_encoded']
  columns_to_standardize = list(set(list(df_poly.filter(regex='age')) + list(df_poly.filter(regex='log_glucose_no_outliers')))) + ['bmi_categorical_encoded'] #bmi jest skategoryzowana, ale medianą w celu zachowania pewnej informacji w wartości dlatego ją standaryzujemy

  
  ### STANDARDIZE + OHE  
  X_new_preprocessed = preprocess_data(None, df_poly, scaler = True, ohe=True, columns_to_standardize = columns_to_standardize, cat_features = cat_features, test_only=True, sc=standard_scaler)

  return X_new_preprocessed
In [68]:
#X_new
def predict_stroke(X : pd.DataFrame(), lr_model, standard_scaler):
  """
  Function predicts the strokes based on the given features using trained model.
  """
  #create model with the best parameters
  model = lr_model

  #preprocess data
  X_preprocessed = preprocess_final(X, standard_scaler)

  predicted_strokes = model.predict(X_preprocessed, threshold=0.3)
  
  return predicted_strokes
In [69]:
#Load previously trained model
loaded_model = joblib.load(filename)
loaded_model
Out[69]:
LogisticRegressionWithThreshold(C=1, max_iter=10000, penalty='l1',
                                random_state=123, solver='saga')
In [70]:
#Predict strokes
predict_stroke(df.loc[0:5,:], loaded_model, standard_scaler)
Out[70]:
array([0, 0, 0, 0, 0, 0])

Additional sources¶

  • http://cejsh.icm.edu.pl/cejsh/element/bwmeta1.element.desklight-002cb3e1-70f4-4321-a489-f4ced00e9d3b
  • https://www.analyticsvidhya.com/blog/2020/10/overcoming-class-imbalance-using-smote-techniques/
  • https://towardsdatascience.com/imbalanced-classification-in-python-smote-enn-method-db5db06b8d50
  • https://machinelearningmastery.com/polynomial-features-transforms-for-machine-learning/
  • https://towardsdatascience.com/regularization-in-machine-learning-connecting-the-dots-c6e030bfaddd
  • https://en.wikipedia.org/wiki/Confusion_matrix
  • https://scikit-learn.org/stable/index.html
  • https://home.cs.colorado.edu/~mozer/Teaching/syllabi/6622/papers/Platt1999.pdf
In [ ]: